# Basic libraries
library(ggplot2)
library(dplyr)
library(tidyverse)
library(corrplot)
library(ggcorrplot)
library(lubridate)
library(jsonlite)
library(stringi)
# Text mining libs
library(SnowballC)
library(tidytext)
library(spacyr)
library(tm)
library(wordcloud2)
library(ggraph)
library(textstem)
library(ggridges)
# Read in the Data from a CSV file.
# Load the raw YouTube trending-videos data.
df <- read.csv('YouTube-videos.csv') #colClasses=c("headline"="character")

# Drop columns that are not used anywhere in this analysis.
for (unused_col in c("thumbnail_link", "comments_disabled",
                     "video_error_or_removed", "ratings_disabled")) {
  df[[unused_col]] <- NULL
}

# Parse the date columns; trending_date is stored as yy.dd.mm in this dataset.
df$trending_date <- as.Date(df$trending_date, "%y.%d.%m")
df$publish_time <- as.Date(df$publish_time, "%Y-%m-%d")

# The numeric category id is categorical, not quantitative.
df$category_id <- as.factor(df$category_id)

# Load the YouTube category id -> name lookup table.
cats <- as.data.frame(fromJSON("youtubeVideoCatUS.json", flatten = TRUE))

# Attach the English category name for each video via an id lookup,
# then store it as a factor.
df$category_name <- cats$items.snippet.title[match(df$category_id, cats$items.id)]
df$category_name <- as.factor(df$category_name)
# Clean up the '#NAME?' placeholder values in video_id.
# First, count how many records carry the corrupted id.
df %>%
filter(video_id == '#NAME?') %>%
summarize(total_records = n())
# Replace each occurrence of '#NAME?' with its own random 11-character
# string. NOTE(fix): the original called stri_rand_strings(1, 11), which
# produced a single random string that was recycled into every corrupted
# row, so all 525 records ended up sharing one id.
n_bad <- sum(df$video_id == '#NAME?')
df$video_id[df$video_id == '#NAME?'] <- stri_rand_strings(n_bad, 11)
df$video_id <- as.factor(df$video_id)
# Perform Exploratory Data Analysis to better understand the data.
str(df)
## 'data.frame': 40881 obs. of 13 variables:
## $ video_id : Factor w/ 24104 levels "__4c1JCHvaQ",..: 14219 585 2731 6677 1593 846 321 1464 11920 2086 ...
## $ trending_date: Date, format: "2017-11-14" "2017-11-14" ...
## $ title : chr "Eminem - Walk On Water (Audio) ft. Beyoncé" "PLUSH - Bad Unboxing Fan Mail" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "I Dare You: GOING BALD!?" ...
## $ channel_title: chr "EminemVEVO" "iDubbbzTV" "Rudy Mancuso" "nigahiga" ...
## $ category_id : Factor w/ 17 levels "1","2","10","15",..: 3 9 9 10 3 11 9 8 10 8 ...
## $ publish_time : Date, format: "2017-11-10" "2017-11-13" ...
## $ tags : chr "Eminem|\"Walk\"|\"On\"|\"Water\"|\"Aftermath/Shady/Interscope\"|\"Rap\"" "plush|\"bad unboxing\"|\"unboxing\"|\"fan mail\"|\"idubbbztv\"|\"idubbbztv2\"|\"things\"|\"best\"|\"packages\"|"| __truncated__ "racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"|\"racist\"|\"superman\"|\"love\"|\"rudy mancuso poo bear"| __truncated__ "ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"idy\"|\"rhpc\"|\"dares\"|\"no truth\"|\"comments\"|\"com"| __truncated__ ...
## $ views : int 17158579 1014651 3191434 2095828 33523622 1309699 2987945 748374 4477587 505161 ...
## $ likes : int 787425 127794 146035 132239 1634130 103755 187464 57534 292837 4135 ...
## $ dislikes : int 43420 1688 5339 1989 21082 4613 9850 2967 4123 976 ...
## $ comment_count: int 125882 13030 8181 17518 85067 12143 26629 15959 36391 1484 ...
## $ description : chr "Eminem's new track Walk on Water ft. Beyoncé is available everywhere: http://shady.sr/WOWEminem \\nPlaylist Bes"| __truncated__ "STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. e"| __truncated__ "WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► https://www.youtube.com/channel/UC5jkXpfnBhlDjqh0ir5FsIQ?sub_confir"| __truncated__ "I know it's been a while since we did this show, but we're back with what might be the best episode yet!\\nLeav"| __truncated__ ...
## $ category_name: Factor w/ 17 levels "Autos & Vehicles",..: 9 2 2 4 9 10 2 12 4 12 ...
# Five-number summaries for the identifier, date, and engagement columns.
summary(df[, c("video_id", "trending_date", "publish_time", "views",
               "likes", "dislikes", "comment_count", "category_name")])
## video_id trending_date publish_time
## 3wwEC4Co4j6: 525 Min. :2017-11-14 Min. :2008-01-13
## 6ZfuNTqbHE8: 8 1st Qu.:2018-01-04 1st Qu.:2018-01-02
## l_lblj8Cq0o: 8 Median :2018-02-26 Median :2018-02-24
## UceaB4D0jpo: 8 Mean :2018-02-27 Mean :2018-02-23
## VYOjWnS4cMY: 8 3rd Qu.:2018-04-24 3rd Qu.:2018-04-23
## 7X_WvGAhMlQ: 7 Max. :2018-06-14 Max. :2018-06-14
## (Other) :40317
## views likes dislikes comment_count
## Min. : 733 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 143902 1st Qu.: 2191 1st Qu.: 99 1st Qu.: 417
## Median : 371204 Median : 8780 Median : 303 Median : 1301
## Mean : 1147036 Mean : 39583 Mean : 2009 Mean : 5043
## 3rd Qu.: 963302 3rd Qu.: 28717 3rd Qu.: 950 3rd Qu.: 3713
## Max. :137843120 Max. :5053338 Max. :1602383 Max. :1114800
##
## category_name
## Entertainment :13451
## News & Politics: 4159
## People & Blogs : 4105
## Comedy : 3773
## Music : 3731
## Sports : 2787
## (Other) : 8875
# The video_id occurring 525 times corresponds to the records whose
# '#NAME?' placeholder was replaced above.
head(df)
tail(df)
# Overall distribution of views; views are heavily right-skewed, so the
# x axis uses a log10 transformation.
df %>%
ggplot(aes(x = views)) +
geom_histogram(color = "lightblue3", fill = "lightblue", bins = 30) +
scale_x_continuous(trans='log10') +
theme_minimal() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Count (Log 10)",
title = "Distribution of Views",
subtitle = "Using a log10 transformation on the x-axis"
)
# Mean views per category, with bars ordered from highest to lowest mean.
# NOTE(fix): arrange() does not affect bar order in ggplot (order follows
# factor levels), so reorder() is used to realize the intended sorting.
df %>%
group_by(category_name) %>%
summarise(mean_views = mean(views), .groups = "drop") %>%
ggplot(aes(x = reorder(category_name, mean_views), y = mean_views)) +
geom_col(color = "lightblue3", fill = "lightblue") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Mean Views",
title = "Mean Number of Video Views per Category"
) +
coord_flip()
# Per-category distribution of views: one log10 histogram panel per category.
ggplot(df, aes(x = views, color = category_name, fill = category_name)) +
  geom_histogram(alpha = 0.6, bins = 30) +
  scale_x_continuous(trans = "log10") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  ) +
  labs(x = "", y = "") +
  facet_wrap(~category_name)
# Boxplots of views by category on a log10 axis, with the legend kept
# on the right in place of y-axis labels.
ggplot(df, aes(views, color = category_name)) +
  geom_boxplot() +
  scale_x_continuous(trans = "log10") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(face = "bold")
  ) +
  labs(
    title = "Video Views Distribution by Category",
    x = "Views (Log10 Transformation)",
    y = NULL
  )
# Mean views per category, aggregated by the month of the trending date,
# shown as stacked columns (one stack per month).
df %>%
  mutate(mon = floor_date(trending_date, "month")) %>%
  group_by(mon, category_name) %>%
  summarize(total = mean(views), .groups = "keep") %>%
  ggplot(aes(x = mon, y = total, fill = category_name)) +
  geom_col(color = "black") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y", expand = c(0, 0)) +
  theme_classic() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = "Trending Date", y = "Views",
    title = "Mean Number of Views per Category",
    subtitle = "Grouped by Month for Trending Date"
  )
# Overall distribution of likes on a log10 x axis (same treatment as views).
df %>%
ggplot(aes(x = likes)) +
geom_histogram(color = "gray", fill = "lightgray", bins = 30) +
scale_x_continuous(trans='log10') +
theme_minimal() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Count (Log 10)",
title = "Distribution of Likes",
subtitle = "Using a log10 transformation on the x-axis"
)
# Per-category distribution of likes: one log10 histogram panel per category.
ggplot(df, aes(x = likes, color = category_name, fill = category_name)) +
  geom_histogram(alpha = 0.6, bins = 30) +
  scale_x_continuous(trans = "log10") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  ) +
  labs(x = "", y = "") +
  facet_wrap(~category_name)
# Mean likes per category, aggregated by the month of the trending date.
# NOTE(fix): the summarize computes mean(likes), so the title now says
# "Mean Number of Likes" rather than "Number of Likes".
df %>%
mutate(mon = floor_date(trending_date, 'month')) %>%
group_by(mon, category_name) %>%
summarize(total = mean(likes), .groups = 'keep') %>%
ggplot(aes(x=mon, y=total, fill=category_name)) +
geom_col(color = "black") +
scale_x_date(date_breaks = "1 month", expand = c(0,0), date_labels = "%b-%y") +
theme_classic() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = "Trending Date", y = "Likes",
title = "Mean Number of Likes per Category",
subtitle = "Grouped by Month for Trending Date"
)
# Overall distribution of dislikes on a log10 x axis.
df %>%
ggplot(aes(x = dislikes)) +
geom_histogram(color = "gray", fill = "lightgray", bins = 30) +
scale_x_continuous(trans='log10') +
theme_minimal() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Count (Log 10)",
title = "Distribution of Dislikes",
subtitle = "Using a log10 transformation on the x-axis"
)
# Mean dislikes per category, aggregated by the month of the trending date.
# NOTE(fix): the summarize computes mean(dislikes), so the title now says
# "Mean Number of Dislikes" rather than "Number of Dislikes".
df %>%
mutate(mon = floor_date(trending_date, 'month')) %>%
group_by(mon, category_name) %>%
summarize(total = mean(dislikes), .groups = 'keep') %>%
ggplot(aes(x=mon, y=total, fill=category_name)) +
geom_col(color = "black") +
scale_x_date(date_breaks = "1 month", expand = c(0,0), date_labels = "%b-%y") +
theme_classic() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = "Trending Date", y = "Dislikes",
title = "Mean Number of Dislikes per Category",
subtitle = "Grouped by Month for Trending Date"
)
# People & Blogs videos trending in January 2018, most disliked first.
# NOTE(fix): the original compared Date values against "mm-dd-yyyy"
# strings (a lexicographic comparison in the wrong format) and combined
# two lower bounds (>= and >), so the filter window was wrong. Compare
# against proper Date objects and use a closed range instead.
df %>%
filter(trending_date >= as.Date("2018-01-01") &
       trending_date <= as.Date("2018-02-01") &
       category_name == "People & Blogs") %>%
arrange(desc(dislikes))
The following are the top five videos based on Likes. We can see that the top 5 videos are all the same, but with different trending dates.
# Top five rows by likes; the same video can appear once per trending day.
df %>%
  arrange(desc(likes)) %>%
  select(trending_date, title, likes) %>%
  slice_head(n = 5)
The following are the top five unique videos based on likes.
# Top five distinct videos by likes (deduplicated on video_id after
# sorting, so each video keeps its highest-likes row).
df %>%
  arrange(desc(likes)) %>%
  distinct(video_id, .keep_all = TRUE) %>%
  select(trending_date, title, likes) %>%
  slice_head(n = 5)
The following are the top five unique videos based on views.
# Top five distinct videos by views (deduplicated on video_id after
# sorting, so each video keeps its highest-views row).
df %>%
  arrange(desc(views)) %>%
  distinct(video_id, .keep_all = TRUE) %>%
  select(trending_date, title, views) %>%
  slice_head(n = 5)
# Numeric columns only, with likes (the response of interest) listed first.
# Fixes: '<-' for assignment, and where(is.numeric) in place of the
# superseded select_if().
df_corr <- df %>%
  select(where(is.numeric)) %>%
  select(likes, views, dislikes, comment_count)
In order to find out which numeric values correlate to likes, we can create a correlation matrix and correlation plot.
# Pairwise Pearson correlation matrix, rounded to two decimal places.
corr <- cor(df_corr) %>% round(2)
print(corr)
## likes views dislikes comment_count
## likes 1.00 0.83 0.46 0.84
## views 0.83 1.00 0.56 0.69
## dislikes 0.46 0.56 1.00 0.64
## comment_count 0.84 0.69 0.64 1.00
We can see that all of the other numeric values positively correlate with likes: comment_count is the strongest at 0.84, views is next at 0.83, and dislikes is the weakest at 0.46.
Next we can see a visualization of these values.
# Heatmap of the correlation matrix with the coefficients printed in each cell.
ggcorrplot(
  corr,
  colors = c("#6D9EC1", "white", "#E46726"),
  lab = TRUE,
  ggtheme = ggplot2::theme_gray
)
### Scatterplot
# Scatterplot of likes vs. comment_count (the strongest correlate of
# likes), both axes log10-scaled, with a dashed linear fit.
# NOTE(fix): the axis labels and title previously said "Views"/"Likes",
# but the plot maps x = likes and y = comment_count.
df %>%
ggplot(aes(x = likes, y = comment_count)) +
geom_point(alpha = 0.1, color = "#E46726") +
theme_minimal() +
scale_x_continuous(trans='log10') +
scale_y_continuous(trans='log10') +
geom_smooth(method = "lm", se = TRUE, formula = y ~ x, color = "black", linetype = 'dashed') +
labs(x = "Likes (Log 10 Scale)",
     y = "Comment Count (Log 10 Scale)",
     title = "Correlation of Likes vs. Comment Count")
After transforming the data on both axes with a log 10 scale, we can see the very linear, positive relationship between the dependent variable Likes, and our strongest correlated independent variable, comment_count.
Next we’ll create a simple linear regression model and a multiple linear regression model and test which performs better.
# Simple linear regression: likes predicted by comment_count alone.
# log1p() tames the heavy right skew of both counts and is safe for zeros.
fit1 <- lm(formula = log1p(likes) ~ log1p(comment_count), data = df)
summary(fit1)
##
## Call:
## lm(formula = log1p(likes) ~ log1p(comment_count), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3344 -0.6043 0.0959 0.6852 10.1327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.875172 0.023547 122.1 <2e-16 ***
## log1p(comment_count) 0.858186 0.003238 265.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.271 on 40879 degrees of freedom
## Multiple R-squared: 0.6322, Adjusted R-squared: 0.6322
## F-statistic: 7.026e+04 on 1 and 40879 DF, p-value: < 2.2e-16
# Multiple linear regression: likes predicted by views, comment_count,
# and dislikes, all log1p-transformed for skew.
fit2 <- lm(formula = log1p(likes) ~ log1p(views) + log1p(comment_count) + log1p(dislikes), data = df)
summary(fit2)
##
## Call:
## lm(formula = log1p(likes) ~ log1p(views) + log1p(comment_count) +
## log1p(dislikes), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2157 -0.5824 0.1084 0.6662 5.0774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.235145 0.060428 -3.891 9.99e-05 ***
## log1p(views) 0.312152 0.006780 46.040 < 2e-16 ***
## log1p(comment_count) 0.415454 0.003919 106.017 < 2e-16 ***
## log1p(dislikes) 0.387105 0.005751 67.316 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 40877 degrees of freedom
## Multiple R-squared: 0.7637, Adjusted R-squared: 0.7637
## F-statistic: 4.403e+04 on 3 and 40877 DF, p-value: < 2.2e-16
# Nested-model F test: does adding views and dislikes (fit2)
# significantly improve on the single-predictor model (fit1)?
anova(fit1, fit2)
Summary: Both models are significant with very small P-values, and each independent variable in the multiple-regression model is also significant. When we perform an Anova test for significance, we can see that the Multiple Linear model (Fit2) does still hold up and therefore is the better model increasing the Adjusted R-squared from 0.6322 to 0.7637.
Therefore, with Model 2, we can say that for the dependent variable Likes, the predictors Views, Comment_Count, and Dislikes together explain 76.37% of the variance.
# Build a tm corpus from the video titles (one document per title).
title_corpus <- Corpus(VectorSource(df$title))
title_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 40881
# Peek at the first four raw titles before cleaning.
inspect(title_corpus[1:4])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4
##
## [1] Eminem - Walk On Water (Audio) ft. Beyoncé
## [2] PLUSH - Bad Unboxing Fan Mail
## [3] Racist Superman | Rudy Mancuso, King Bach & Lele Pons
## [4] I Dare You: GOING BALD!?
Using the tm package, perform transformations on the corpus to clean the text. There are generalized text cleaning activities such as normalization and stop word removal.
# Standard cleansing pipeline for the title corpus.
# NOTE: the "transformation drops documents" warnings are a known, benign
# side effect of passing base functions to tm_map on a SimpleCorpus.
title_corpus <- tm_map(title_corpus, tolower) # normalize case
## Warning in tm_map.SimpleCorpus(title_corpus, tolower): transformation drops
## documents
title_corpus <- tm_map(title_corpus, removePunctuation) # remove punctuation
## Warning in tm_map.SimpleCorpus(title_corpus, removePunctuation): transformation
## drops documents
title_corpus <- tm_map(title_corpus, removeNumbers) # remove numbers
## Warning in tm_map.SimpleCorpus(title_corpus, removeNumbers): transformation
## drops documents
title_corpus <- tm_map(title_corpus, stripWhitespace) # remove white space
## Warning in tm_map.SimpleCorpus(title_corpus, stripWhitespace): transformation
## drops documents
title_corpus <- tm_map(title_corpus, removeWords, stopwords("english")) # remove stopwords
## Warning in tm_map.SimpleCorpus(title_corpus, removeWords, stopwords("english")):
## transformation drops documents
# Re-inspect the first four titles after cleaning.
inspect(title_corpus[1:4])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4
##
## [1] eminem walk water audio ft beyoncé
## [2] plush bad unboxing fan mail
## [3] racist superman rudy mancuso king bach lele pons
## [4] dare going bald
# Stem words using the Snowball stemmer (e.g. "official" -> "offici"),
# collapsing inflected forms onto a shared stem before counting terms.
title_corpus <- tm_map(title_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(title_corpus, stemDocument): transformation drops
## documents
Create a Term-Document Matrix from the cleaned Corpus
# The term-document matrix: each term is a row, each title a column;
# cell values are raw term frequencies.
dtm <- TermDocumentMatrix(title_corpus)
# Show dimensions, sparsity, and a sample of the matrix.
inspect(dtm)
## <<TermDocumentMatrix (terms: 24539, documents: 40881)>>
## Non-/sparse entries: 250374/1002928485
## Sparsity : 100%
## Maximal term length: 81
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 16041 16336 16821 17065 26967 27278 27487 28468 38427 39186
## episod 0 0 0 0 0 0 0 0 0 0
## full 1 1 1 1 1 1 0 0 1 1
## game 0 0 0 0 0 0 0 0 0 0
## music 0 0 0 0 0 0 0 0 0 0
## new 0 0 0 0 0 0 0 0 0 0
## offici 0 0 0 0 0 0 0 0 0 0
## song 0 0 0 0 0 0 0 0 0 0
## trailer 0 0 0 0 0 0 0 0 0 0
## trump 0 0 0 0 0 0 0 0 0 0
## video 0 0 0 0 0 0 0 0 0 0
# Drop very sparse terms: keep only terms appearing in at least 1% of titles.
# Fix: '<-' for assignment.
dtm1 <- removeSparseTerms(dtm, 0.99)
# findFreqTerms(): find terms with at least 5 occurrences in the matrix.
# (The stray prose that was fused onto this call has been moved into
# this comment.)
findFreqTerms(dtm1, 5) %>%
head(50)
## [1] "music" "offici" "video" "paul" "new" "final"
## [7] "food" "game" "full" "latest" "punjabi" "song"
## [13] "react" "time" "top" "youtub" "show" "test"
## [19] "real" "feat" "tri" "live" "trump" "season"
## [25] "break" "best" "trailer" "episod" "get" "make"
## [31] "highlight" "part" "life" "day" "first" "movi"
## [37] "drama" "challeng" "nba" "news" "الحلقة" "war"
# Total occurrences of each retained term across all titles.
termCount <- rowSums(as.matrix(dtm1))
# Keep only terms appearing at least 20 times overall.
termCount <- subset(termCount, termCount >= 20)
df2 <- data.frame(term = names(termCount), freq = termCount)
# Plot the 35 most frequent title terms, most frequent at the top.
# NOTE(fix): the original took head(35) without sorting (first 35 rows in
# matrix order, not the top 35) and used scale_colour_gradientn although
# the mapped aesthetic is fill, so the scale had no effect.
df2 %>%
arrange(desc(freq)) %>%
head(35) %>%
ggplot( aes(x = reorder(term, freq), y = freq, fill= freq)) +
geom_col() +
scale_fill_gradientn(colors = terrain.colors(10)) +
theme_classic() +
coord_flip() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Count",
title = "Most Frequently Occurring Words in Titles"
)
# Build a tm corpus from the video descriptions (one document each).
description_corpus <- Corpus(VectorSource(df$description))
description_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 40881
# Peek at two raw descriptions: note the URLs, literal "\n" escapes, and
# arrow symbols that motivate the custom cleaning steps below.
inspect(description_corpus[1:2])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 2
##
## [1] Eminem's new track Walk on Water ft. Beyoncé is available everywhere: http://shady.sr/WOWEminem \\nPlaylist Best of Eminem: https://goo.gl/AquNpo\\nSubscribe for more: https://goo.gl/DxCrDV\\n\\nFor more visit: \\nhttp://eminem.com\\nhttp://facebook.com/eminem\\nhttp://twitter.com/eminem\\nhttp://instagram.com/eminem\\nhttp://eminem.tumblr.com\\nhttp://shadyrecords.com\\nhttp://facebook.com/shadyrecords\\nhttp://twitter.com/shadyrecords\\nhttp://instagram.com/shadyrecords\\nhttp://trustshady.tumblr.com\\n\\nMusic video by Eminem performing Walk On Water. (C) 2017 Aftermath Records\\nhttp://vevo.ly/gA7xKt
## [2] STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. editing with premiere from now on, gon' be a tedious transition, but i think it's for the best. \\n\\n__\\n\\nSUBSCRIBE ► http://www.youtube.com/subscription_center?add_user=iDubbbztv\\n\\nMain Channel ► https://www.youtube.com/user/iDubbbzTV\\nSecond Channel ► https://www.youtube.com/channel/UC-tsNNJ3yIW98MtPH6PWFAQ\\nGaming Channel ► https://www.youtube.com/channel/UCVhfFXNY0z3-mbrTh1OYRXA\\n\\nWebsite ► http://www.idubbbz.com/\\n\\nInstagram ► https://instagram.com/idubbbz/\\nTwitter ► https://twitter.com/Idubbbz\\nFacebook ► http://www.facebook.com/IDubbbz\\nTwitch ► http://www.twitch.tv/idubbbz\\n_
Using the tm package, perform transformations on the corpus to clean the text. There are generalized text cleaning activities such as normalization and stop word removal.
# Remove URLs ("http" followed by any run of alphanumerics/punctuation).
description_corpus <- tm_map(description_corpus,
content_transformer(function(x) gsub("http[[:alnum:][:punct:]]*", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("http[[:alnum:][:punct:]]*", :
## transformation drops documents
# Remove literal "\n" escape sequences. The pattern "\\\\n" matches a
# backslash followed by "n"; note the replacement is empty, not a space,
# so adjacent words around a removed "\n" are joined.
description_corpus <- tm_map(description_corpus,
content_transformer(function(x) gsub("\\\\n", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("\\\\n", : transformation drops documents
# Remove the "►" arrow symbol used as a decorative bullet in descriptions.
description_corpus <- tm_map(description_corpus,
content_transformer(function(x) gsub("►", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("►", : transformation drops documents
# Remove the alternate "▶" arrow symbol as well.
description_corpus <- tm_map(description_corpus,
content_transformer(function(x) gsub("▶", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("▶", : transformation drops documents
# Inspect the first four descriptions after the custom cleaning.
inspect(description_corpus[1:4])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4
##
## [1] Eminem's new track Walk on Water ft. Beyoncé is available everywhere: Playlist Best of Eminem: for more: more visit: video by Eminem performing Walk On Water. (C) 2017 Aftermath Records
## [2] STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. editing with premiere from now on, gon' be a tedious transition, but i think it's for the best. __SUBSCRIBE Channel Channel Channel
## [3] WATCH MY PREVIOUS VIDEO SUBSCRIBE FOR WATCHING! LIKE & SUBSCRIBE FOR MORE VIDEOS!-----------------------------------------------------------FIND ME ON: Instagram | | | Rudy Mancuso | Pons | Bach | Effects: Caleb Natale | GregoryShots Studios Channels:Alesso | | Jibawi | Puppets | Stocking | Sarkis | Pons | | Tyson | Rudy Mancuso | Studios |
## [4] I know it's been a while since we did this show, but we're back with what might be the best episode yet!Leave your dares in the comment section! Order my book how to write good Launched New Official Store Channel us mail or whatever you want here!PO Box 232355Las Vegas, NV 89105
# Standard cleansing pipeline for the description corpus (same steps as
# for titles). The "transformation drops documents" warnings are benign.
description_corpus <- tm_map(description_corpus, tolower) # normalize case
## Warning in tm_map.SimpleCorpus(description_corpus, tolower): transformation
## drops documents
description_corpus <- tm_map(description_corpus, removePunctuation) # remove punctuation
## Warning in tm_map.SimpleCorpus(description_corpus, removePunctuation):
## transformation drops documents
description_corpus <- tm_map(description_corpus, removeNumbers) # remove numbers
## Warning in tm_map.SimpleCorpus(description_corpus, removeNumbers):
## transformation drops documents
description_corpus <- tm_map(description_corpus, stripWhitespace) # remove white space
## Warning in tm_map.SimpleCorpus(description_corpus, stripWhitespace):
## transformation drops documents
description_corpus <- tm_map(description_corpus, removeWords, stopwords("english")) # remove stopwords
## Warning in tm_map.SimpleCorpus(description_corpus, removeWords,
## stopwords("english")): transformation drops documents
# Re-inspect the first four descriptions after cleansing.
inspect(description_corpus[1:4])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4
##
## [1] eminems new track walk water ft beyoncé available everywhere playlist best eminem visit video eminem performing walk water c aftermath records
## [2] still got lot packages probably will last another year side note nd channel vids soon editing premiere now gon tedious transition think best subscribe channel channel channel
## [3] watch previous video subscribe watching like subscribe videosfind instagram rudy mancuso pons bach effects caleb natale gregoryshots studios channelsalesso jibawi puppets stocking sarkis pons tyson rudy mancuso studios
## [4] know since show back might best episode yetleave dares comment section order book write good launched new official store channel us mail whatever want herepo box las vegas nv
# Stem the description corpus with the Snowball stemmer so inflected
# forms share a single stem before term counting.
description_corpus <- tm_map(description_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(description_corpus, stemDocument): transformation
## drops documents
Create a Term-Document Matrix from the cleaned Corpus
# Term-document matrix for descriptions: each term is a row, each
# description a column; cell values are raw term frequencies.
description_dtm <- TermDocumentMatrix(description_corpus)
# Show dimensions, sparsity, and a sample of the matrix.
inspect(description_dtm)
## <<TermDocumentMatrix (terms: 133227, documents: 40881)>>
## Non-/sparse entries: 2087309/5444365678
## Sparsity : 100%
## Maximal term length: 1561
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 11818 19426 19742 21704 35998 38860 39007 40365 8091 8391
## channel 0 0 0 2 0 0 0 0 0 0
## facebook 0 0 0 0 0 0 0 0 0 0
## get 0 0 0 0 0 0 0 0 0 0
## music 0 0 0 0 4 3 3 0 0 0
## new 0 0 0 0 2 3 3 0 0 0
## show 0 0 0 0 3 3 3 0 0 0
## subscrib 0 0 0 1 2 2 2 0 0 0
## twitter 0 0 0 0 0 0 0 0 0 0
## video 0 0 0 0 6 6 6 0 0 0
## watch 1 0 0 0 0 0 0 0 0 0
# Drop very sparse terms: keep only terms present in at least 1% of
# descriptions. Fix: '<-' for assignment.
description_dtm1 <- removeSparseTerms(description_dtm, 0.99)
# findFreqTerms(): find terms with at least 5 occurrences in the matrix.
# (The stray prose that was fused onto this call has been moved into
# this comment.)
findFreqTerms(description_dtm1, 5) %>%
head(50)
## [1] "avail" "best" "new" "perform" "playlist" "record"
## [7] "track" "video" "visit" "walk" "water" "anoth"
## [13] "channel" "edit" "got" "last" "lot" "note"
## [19] "now" "premier" "side" "soon" "still" "subscrib"
## [25] "think" "will" "year" "effect" "instagram" "like"
## [31] "previous" "studio" "watch" "back" "book" "box"
## [37] "comment" "episod" "good" "know" "mail" "might"
## [43] "offici" "order" "section" "show" "sinc" "store"
## [49] "want" "addit"
# Total occurrences of each retained term across all descriptions.
termCount <- rowSums(as.matrix(description_dtm1))
# Keep only terms appearing at least 20 times overall.
termCount <- subset(termCount, termCount >= 20)
description_df <- data.frame(term = names(termCount), freq = termCount)
# Plot the 35 most frequent description terms, most frequent at the top.
# NOTE(fix): sort before head(35) (the original took the first 35 rows in
# matrix order), use scale_fill_* to match the mapped fill aesthetic, and
# label the chart "Descriptions" — the original title said "Titles".
description_df %>%
arrange(desc(freq)) %>%
head(35) %>%
ggplot( aes(x = reorder(term, freq), y = freq, fill= freq)) +
geom_col() +
scale_fill_gradientn(colors = terrain.colors(10)) +
theme_classic() +
coord_flip() +
theme(
plot.title = element_text(face = "bold")) +
labs(
x = NULL, y = "Count",
title = "Most Frequently Occurring Words in Descriptions"
)